{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# **WordLift** Vector Store\n",
    "\n",
    "## Introduction\n",
    "This script demonstrates how to crawl a product website, extract relevant information, build an SEO-friendly Knowledge Graph (a structured representation of PDPs and PLPs), and leverage it for improved search and user experience.\n",
    "\n",
    "### Key Features & Libraries:\n",
    "\n",
    "- Web scraping (Advertools)\n",
    "- Knowledge Graph creation for Product Detail Pages (PDPs) and Product Listing Pages (PLPs) - WordLift\n",
    "- Product recommendations (WordLift Neural Search)\n",
    "- Shopping assistant creation (WordLift + LlamaIndex 🦙)\n",
    "\n",
    "This approach enhances SEO performance and user engagement for e-commerce sites.\n",
    "\n",
    "Learn more about how it works here:\n",
    "- [https://www.youtube.com/watch?v=CH-ir1MTAwQ](https://www.youtube.com/watch?v=CH-ir1MTAwQ)\n",
    "- [https://wordlift.io/academy-entries/mastering-serp-analysis-knowledge-graphs](https://wordlift.io/academy-entries/mastering-serp-analysis-knowledge-graphs)\n",
    "\n",
    "</br></br>\n",
    "<table align=\"left\">\n",
    "  <td>\n",
    "  <a href=\"https://wordlift.io\">\n",
    "    <img width=130px src=\"https://wordlift.io/wp-content/uploads/2018/07/logo-assets-510x287.png\" />\n",
    "    </a>\n",
    "    </td>\n",
    "    <td>\n",
    "      by\n",
    "      <a href=\"https://wordlift.io/blog/en/entity/andrea-volpini\">\n",
    "        Andrea Volpini\n",
    "      </a>\n",
    "      and\n",
    "      <a href=\"https://wordlift.io/blog/en/entity/david-riccitelli\">\n",
    "        David Riccitelli\n",
    "      </a>      \n",
    "      <br/>\n",
    "      <br/>\n",
    "      MIT License\n",
    "      <br/>\n",
    "      <br/>\n",
    "      <i>Last updated: <b>Jul 31st, 2024</b></i>\n",
    "  </td>\n",
    "</table>\n",
    "</br>\n",
    "</br>"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Setup\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!pip install advertools -q\n",
    "!pip install -U wordlift-client # 🎉 first time on stage 🎉\n",
    "!pip install rdflib -q"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Standard library imports\n",
    "import json\n",
    "import logging\n",
    "import os\n",
    "import re\n",
    "import urllib.parse\n",
    "import requests\n",
    "from typing import List, Optional\n",
    "\n",
    "# Third-party imports\n",
    "import advertools as adv\n",
    "import pandas as pd\n",
    "import nest_asyncio\n",
    "\n",
    "# RDFLib imports\n",
    "from rdflib import Graph, Literal, RDF, URIRef\n",
    "from rdflib.namespace import SDO, Namespace, DefinedNamespace\n",
    "\n",
    "# WordLift client imports\n",
    "import wordlift_client\n",
    "from wordlift_client import Configuration, ApiClient\n",
    "from wordlift_client.rest import ApiException\n",
    "from wordlift_client.api.dataset_api import DatasetApi\n",
    "from wordlift_client.api.entities_api import EntitiesApi\n",
    "from wordlift_client.api.graph_ql_api import GraphQLApi\n",
    "from wordlift_client.models.graphql_request import GraphqlRequest\n",
    "from wordlift_client.models.page_vector_search_query_response_item import (\n",
    "    PageVectorSearchQueryResponseItem,\n",
    ")\n",
    "from wordlift_client.models.vector_search_query_request import (\n",
    "    VectorSearchQueryRequest,\n",
    ")\n",
    "from wordlift_client.api.vector_search_queries_api import (\n",
    "    VectorSearchQueriesApi,\n",
    ")\n",
    "\n",
    "\n",
    "# Asynchronous programming\n",
    "import asyncio\n",
    "\n",
    "# Set up logging\n",
    "logging.basicConfig(level=logging.INFO)\n",
    "logger = logging.getLogger(__name__)\n",
    "\n",
    "# Apply nest_asyncio\n",
    "nest_asyncio.apply()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "WORDLIFT_KEY = os.getenv(\"WORDLIFT_KEY\")\n",
    "OPENAI_KEY = os.getenv(\"OPENAI_KEY\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Crawl the Website w/ Advertools"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Step 1: Define the website structure\n",
    "# -----------------------------------\n",
    "\n",
    "# We're working with two types of pages:\n",
    "# 1. Product Listing Pages (PLP): https://product-finder.wordlift.io/product-category/bags/\n",
    "# 2. Product Detail Pages (PDP): https://product-finder.wordlift.io/product/1980s-marco-polo-crossbody-bag-in-black/\n",
    "\n",
    "# The product description can be found at this XPath:\n",
    "# /html/body/div[1]/div/div/div/div/div[1]/div/div[3]/div/div[2]/div[2]/div[1]/p/text()\n",
    "# The price is here:\n",
    "# /html/body/div[1]/div/div/div/div/div[1]/div/div[3]/div/div[2]/p/span/bdi/text()\n",
    "# The category is here:\n",
    "# //span[contains(@class, 'breadcrumb')]/a/text()\n",
    "\n",
    "# Step 2: Set up the crawl\n",
    "# ------------------------\n",
    "\n",
    "\n",
    "def crawl_website(url, output_file, num_pages=10):\n",
    "    logger.info(f\"Starting crawl of {url}\")\n",
    "    adv.crawl(\n",
    "        url,\n",
    "        output_file,\n",
    "        follow_links=True,\n",
    "        custom_settings={\n",
    "            \"CLOSESPIDER_PAGECOUNT\": num_pages,\n",
    "            \"USER_AGENT\": \"WordLiftBot/1.0 (Maven Project)\",\n",
    "            \"CONCURRENT_REQUESTS_PER_DOMAIN\": 2,\n",
    "            \"DOWNLOAD_DELAY\": 1,\n",
    "            \"ROBOTSTXT_OBEY\": False,\n",
    "        },\n",
    "        xpath_selectors={\n",
    "            \"product_description\": \"/html/body/div[1]/div/div/div/div/div[1]/div/div[3]/div/div[2]/div[2]/div[1]/p/text()\",\n",
    "            \"product_price\": \"/html/body/div[1]/div/div/div/div/div[1]/div/div[3]/div/div[2]/p/span/bdi/text()\",\n",
    "            \"product_category\": \"//span[@class='posted_in']/a/text()\",\n",
    "        },\n",
    "    )\n",
    "    logger.info(f\"Crawl completed. Results saved to {output_file}\")\n",
    "\n",
    "\n",
    "# Step 3: Analyze URL patterns\n",
    "# ----------------------------\n",
    "\n",
    "\n",
    "def analyze_url_patterns(df):\n",
    "    df[\"page_type\"] = df[\"url\"].apply(\n",
    "        lambda x: \"PLP\"\n",
    "        if \"/product-category/\" in x\n",
    "        else (\"PDP\" if \"/product/\" in x else \"Other\")\n",
    "    )\n",
    "    logger.info(\n",
    "        f\"Found {(df['page_type'] == 'PLP').sum()} PLPs and {(df['page_type'] == 'PDP').sum()} PDPs\"\n",
    "    )\n",
    "    return df\n",
    "\n",
    "\n",
    "# Step 4: Extract page data\n",
    "# ----------------------------\n",
    "\n",
    "\n",
    "def extract_page_data(df):\n",
    "    extracted_data = []\n",
    "    for _, row in df.iterrows():\n",
    "        page = {\n",
    "            \"url\": row[\"url\"],\n",
    "            \"title\": row[\"title\"],\n",
    "            \"page_type\": row[\"page_type\"],\n",
    "            \"meta_description\": row.get(\"meta_description\", \"\"),\n",
    "            \"og_title\": row.get(\"og_title\", \"\"),\n",
    "            \"og_description\": row.get(\"og_description\", \"\"),\n",
    "            \"h1\": \", \".join(row.get(\"h1\", []))\n",
    "            if isinstance(row.get(\"h1\"), list)\n",
    "            else row.get(\"h1\", \"\"),\n",
    "            \"h2\": \", \".join(row.get(\"h2\", []))\n",
    "            if isinstance(row.get(\"h2\"), list)\n",
    "            else row.get(\"h2\", \"\"),\n",
    "        }\n",
    "\n",
    "        if row[\"page_type\"] == \"PDP\":\n",
    "            page.update(\n",
    "                {\n",
    "                    \"product_description\": \", \".join(\n",
    "                        row.get(\"product_description\", [])\n",
    "                    )\n",
    "                    if isinstance(row.get(\"product_description\"), list)\n",
    "                    else row.get(\"product_description\", \"\"),\n",
    "                    \"product_price\": \", \".join(row.get(\"product_price\", []))\n",
    "                    if isinstance(row.get(\"product_price\"), list)\n",
    "                    else row.get(\"product_price\", \"\"),\n",
    "                    \"product_category\": \", \".join(\n",
    "                        row.get(\"product_category\", [])\n",
    "                    )\n",
    "                    if isinstance(row.get(\"product_category\"), list)\n",
    "                    else row.get(\"product_category\", \"\"),\n",
    "                }\n",
    "            )\n",
    "        elif row[\"page_type\"] == \"PLP\":\n",
    "            # Parse the category from the H1 content\n",
    "            h1_content = (\n",
    "                row.get(\"h1\", [\"\"])[0]\n",
    "                if isinstance(row.get(\"h1\"), list)\n",
    "                else row.get(\"h1\", \"\")\n",
    "            )\n",
    "            category = (\n",
    "                h1_content.split(\"@@\")[-1]\n",
    "                if \"@@\" in h1_content\n",
    "                else h1_content.replace(\"Category: \", \"\").strip()\n",
    "            )\n",
    "            page[\"category_name\"] = category\n",
    "\n",
    "        extracted_data.append(page)\n",
    "\n",
    "    return pd.DataFrame(extracted_data)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Build the KG w/ WordLift 🕸"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Step 5: Configure the WordLift client\n",
    "# ----------------------------\n",
    "\n",
    "# Create a configuration object for the WordLift API client using your WordLift key.\n",
    "configuration = Configuration(host=\"https://api.wordlift.io\")\n",
    "configuration.api_key[\"ApiKey\"] = WORDLIFT_KEY\n",
    "configuration.api_key_prefix[\"ApiKey\"] = \"Key\"\n",
    "\n",
    "EXAMPLE_PRIVATE_NS = Namespace(\"https://ns.example.org/private/\")\n",
    "\n",
    "BASE_URI = \"http://data.wordlift.io/[dataset_id]/\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Step 6: Build the KG and the embeddings\n",
    "# ----------------------------\n",
    "\n",
    "\n",
    "async def cleanup_knowledge_graph(api_client):\n",
    "    dataset_api = wordlift_client.DatasetApi(api_client)\n",
    "    try:\n",
    "        # Delete all\n",
    "        await dataset_api.delete_all_entities()\n",
    "    except Exception as e:\n",
    "        print(\n",
    "            \"Exception when calling DatasetApi->delete_all_entities: %s\\n\" % e\n",
    "        )\n",
    "\n",
    "\n",
    "async def create_entity(entities_api, entity_data):\n",
    "    g = Graph().parse(data=json.dumps(entity_data), format=\"json-ld\")\n",
    "    body = g.serialize(format=\"application/rdf+xml\")\n",
    "    await entities_api.create_or_update_entities(\n",
    "        body=body, _content_type=\"application/rdf+xml\"\n",
    "    )\n",
    "\n",
    "\n",
    "def replace_url(original_url: str) -> str:\n",
    "    old_domain = \"https://product-finder.wordlift.io/\"\n",
    "    new_domain = \"https://data-science-with-python-for-seo.wordlift.dev/\"\n",
    "\n",
    "    if original_url.startswith(old_domain):\n",
    "        return original_url.replace(old_domain, new_domain, 1)\n",
    "    else:\n",
    "        return original_url\n",
    "\n",
    "\n",
    "def create_entity_uri(url):\n",
    "    parsed_url = urllib.parse.urlparse(url)\n",
    "    path = parsed_url.path.strip(\"/\")\n",
    "    path_parts = path.split(\"/\")\n",
    "    fragment = parsed_url.fragment\n",
    "\n",
    "    if \"product\" in path_parts:\n",
    "        # It's a product page or product offer\n",
    "        product_id = path_parts[-1]  # Get the last part of the path\n",
    "        if fragment == \"offer\":\n",
    "            return f\"{BASE_URI}offer_{product_id}\"\n",
    "        else:\n",
    "            return f\"{BASE_URI}product_{product_id}\"\n",
    "    elif \"product-category\" in path_parts:\n",
    "        # It's a product listing page (PLP)\n",
    "        category = path_parts[-1]  # Get the last part of the path\n",
    "        return f\"{BASE_URI}plp_{category}\"\n",
    "    else:\n",
    "        # For any other type of page\n",
    "        safe_path = \"\".join(c if c.isalnum() else \"_\" for c in path)\n",
    "        if fragment == \"offer\":\n",
    "            return f\"{BASE_URI}offer_{safe_path}\"\n",
    "        else:\n",
    "            return f\"{BASE_URI}page_{safe_path}\"\n",
    "\n",
    "\n",
    "def clean_price(price_str):\n",
    "    if not price_str or price_str == \"N/A\":\n",
    "        return None\n",
    "    if isinstance(price_str, (int, float)):\n",
    "        return float(price_str)\n",
    "    try:\n",
    "        # Remove any non-numeric characters except for the decimal point\n",
    "        cleaned_price = \"\".join(\n",
    "            char for char in str(price_str) if char.isdigit() or char == \".\"\n",
    "        )\n",
    "        return float(cleaned_price)\n",
    "    except ValueError:\n",
    "        logger.warning(f\"Could not convert price: {price_str}\")\n",
    "        return None\n",
    "\n",
    "\n",
    "def create_product_entity(row, dataset_uri):\n",
    "    url = replace_url(row[\"url\"])\n",
    "    product_entity_uri = create_entity_uri(url)\n",
    "\n",
    "    entity_data = {\n",
    "        \"@context\": \"http://schema.org\",\n",
    "        \"@type\": \"Product\",\n",
    "        \"@id\": product_entity_uri,\n",
    "        \"url\": url,\n",
    "        \"name\": row[\"title\"]\n",
    "        if not pd.isna(row[\"title\"])\n",
    "        else \"Untitled Product\",\n",
    "        \"urn:meta:requestEmbeddings\": [\n",
    "            \"http://schema.org/name\",\n",
    "            \"http://schema.org/description\",\n",
    "        ],\n",
    "    }\n",
    "\n",
    "    if not pd.isna(row.get(\"product_description\")):\n",
    "        entity_data[\"description\"] = row[\"product_description\"]\n",
    "\n",
    "    if not pd.isna(row.get(\"product_price\")):\n",
    "        price = clean_price(row[\"product_price\"])\n",
    "        if price is not None:\n",
    "            # Create offer ID as a sub-resource of the product ID\n",
    "            offer_entity_uri = f\"{product_entity_uri}/offer_1\"\n",
    "            entity_data[\"offers\"] = {\n",
    "                \"@type\": \"Offer\",\n",
    "                \"@id\": offer_entity_uri,\n",
    "                \"price\": str(price),\n",
    "                \"priceCurrency\": \"GBP\",\n",
    "                \"availability\": \"http://schema.org/InStock\",\n",
    "                \"url\": url,\n",
    "            }\n",
    "\n",
    "    if not pd.isna(row.get(\"product_category\")):\n",
    "        entity_data[\"category\"] = row[\"product_category\"]\n",
    "\n",
    "    custom_attributes = {\n",
    "        key: row[key]\n",
    "        for key in [\n",
    "            \"meta_description\",\n",
    "            \"og_title\",\n",
    "            \"og_description\",\n",
    "            \"h1\",\n",
    "            \"h2\",\n",
    "        ]\n",
    "        if not pd.isna(row.get(key))\n",
    "    }\n",
    "    if custom_attributes:\n",
    "        entity_data[str(EXAMPLE_PRIVATE_NS.attributes)] = json.dumps(\n",
    "            custom_attributes\n",
    "        )\n",
    "\n",
    "    return entity_data\n",
    "\n",
    "\n",
    "def create_collection_entity(row, dataset_uri):\n",
    "    url = replace_url(row[\"url\"])\n",
    "    entity_uri = create_entity_uri(url)\n",
    "\n",
    "    entity_data = {\n",
    "        \"@context\": \"http://schema.org\",\n",
    "        \"@type\": \"CollectionPage\",\n",
    "        \"@id\": entity_uri,\n",
    "        \"url\": url,\n",
    "        \"name\": row[\"category_name\"] or row[\"title\"],\n",
    "    }\n",
    "\n",
    "    custom_attributes = {\n",
    "        key: row[key]\n",
    "        for key in [\n",
    "            \"meta_description\",\n",
    "            \"og_title\",\n",
    "            \"og_description\",\n",
    "            \"h1\",\n",
    "            \"h2\",\n",
    "        ]\n",
    "        if row.get(key)\n",
    "    }\n",
    "    if custom_attributes:\n",
    "        entity_data[str(EXAMPLE_PRIVATE_NS.attributes)] = json.dumps(\n",
    "            custom_attributes\n",
    "        )\n",
    "\n",
    "    return entity_data\n",
    "\n",
    "\n",
    "async def build_knowledge_graph(df, dataset_uri, api_client):\n",
    "    entities_api = EntitiesApi(api_client)\n",
    "\n",
    "    for _, row in df.iterrows():\n",
    "        try:\n",
    "            if row[\"page_type\"] == \"PDP\":\n",
    "                entity_data = create_product_entity(row, dataset_uri)\n",
    "            elif row[\"page_type\"] == \"PLP\":\n",
    "                entity_data = create_collection_entity(row, dataset_uri)\n",
    "            else:\n",
    "                logger.warning(\n",
    "                    f\"Skipping unknown page type for URL: {row['url']}\"\n",
    "                )\n",
    "                continue\n",
    "\n",
    "            if entity_data is None:\n",
    "                logger.warning(\n",
    "                    f\"Skipping page due to missing critical data: {row['url']}\"\n",
    "                )\n",
    "                continue\n",
    "\n",
    "            await create_entity(entities_api, entity_data)\n",
    "            logger.info(\n",
    "                f\"Created entity for {row['page_type']}: {row['title']}\"\n",
    "            )\n",
    "        except Exception as e:\n",
    "            logger.error(\n",
    "                f\"Error creating entity for {row['page_type']}: {row['title']}\"\n",
    "            )\n",
    "            logger.error(f\"Error: {str(e)}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Run the show"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# ----------------------------\n",
    "# Main Execution\n",
    "# ----------------------------\n",
    "\n",
    "# Global configuration variables\n",
    "CRAWL_URL = \"https://product-finder.wordlift.io/\"\n",
    "OUTPUT_FILE = \"crawl_results.jl\"\n",
    "\n",
    "\n",
    "async def main():\n",
    "    # Step 1: Crawl the website\n",
    "    crawl_website(CRAWL_URL, OUTPUT_FILE)\n",
    "\n",
    "    # Step 2: Load the crawled data\n",
    "    df = pd.read_json(OUTPUT_FILE, lines=True)\n",
    "\n",
    "    # Step 3: Analyze URL patterns\n",
    "    df = analyze_url_patterns(df)\n",
    "\n",
    "    # Step 4: Extract page data\n",
    "    pages_df = extract_page_data(df)\n",
    "\n",
    "    async with ApiClient(configuration) as api_client:\n",
    "        # Clean up the existing knowledge graph\n",
    "        try:\n",
    "            await cleanup_knowledge_graph(api_client)\n",
    "            logger.info(f\"Knowledge Graph Cleaned Up\")\n",
    "        except Exception as e:\n",
    "            logger.error(\n",
    "                f\"Failed to clean up the existing Knowledge Graph: {str(e)}\"\n",
    "            )\n",
    "            return  # Exit if cleanup fails\n",
    "\n",
    "        # Build the new knowledge graph\n",
    "        await build_knowledge_graph(pages_df, CRAWL_URL, api_client)\n",
    "\n",
    "    logger.info(\"Knowledge graph building completed.\")\n",
    "\n",
    "\n",
    "if __name__ == \"__main__\":\n",
    "    asyncio.run(main())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Let's query products in the KG now using GraphQL"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "async def perform_graphql_query(api_client):\n",
    "    graphql_api = GraphQLApi(api_client)\n",
    "    query = \"\"\"\n",
    "    {\n",
    "        products(rows: 20) {\n",
    "            id: iri\n",
    "            category: string(name:\"schema:category\")\n",
    "            name: string(name:\"schema:name\")\n",
    "            description: string(name:\"schema:description\")\n",
    "            url: string(name:\"schema:url\")\n",
    "        }\n",
    "    }\n",
    "    \"\"\"\n",
    "    request = GraphqlRequest(query=query)\n",
    "\n",
    "    try:\n",
    "        response = await graphql_api.graphql_using_post(body=request)\n",
    "        print(\"GraphQL Query Results:\")\n",
    "        print(json.dumps(response, indent=2))\n",
    "    except Exception as e:\n",
    "        logger.error(f\"An error occurred during GraphQL query: {e}\")\n",
    "\n",
    "\n",
    "async with ApiClient(configuration) as api_client:\n",
    "    # Step 6: Perform GraphQL query\n",
    "    await perform_graphql_query(api_client)\n",
    "    logger.info(\"Knowledge graph building and GraphQL query completed.\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Leveraging the Knowledge Graph\n",
    "\n",
    "Now that we have successfully created a Knowledge Graph for our e-commerce website, complete with product embeddings, we can take advantage of it to enhance user experience and functionality. The embeddings we've generated for each product allow us to perform semantic similarity searches and build more intelligent systems.\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Adding Structured Data to your Web Pages\n",
    "\n",
    "In this section, we will perform a simple test of WordLift's data API. This API is used to inject structured data markup from the Knowledge Graph (KG) into your webpages. Structured data helps search engines better understand your content, potentially leading to rich snippets in search results and improved SEO.\n",
    "\n",
    "For this notebook, we're using a pre-configured KG on a demo e-commerce website. We'll be referencing a fictitious URL: `https://data-science-with-python-for-seo.wordlift.dev`.\n",
    "\n",
    "When calling WordLift's data API, we simply pass a URL and receive the corresponding JSON-LD (JavaScript Object Notation for Linked Data). This structured data typically includes information such as product details, pricing, and availability for e-commerce sites.\n",
    "\n",
    "The `get_json_ld_from_url()` function below demonstrates this process. It takes a URL as input and returns the structured data in JSON-LD format, ready to be injected into your webpage."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_json_ld_from_url(url):\n",
    "    # Construct the API URL by prefixing with 'https://api.wordlift.io/data/https/'\n",
    "    api_url = \"https://api.wordlift.io/data/https/\" + url.replace(\n",
    "        \"https://\", \"\"\n",
    "    )\n",
    "\n",
    "    # Make the GET request to the API\n",
    "    response = requests.get(api_url)\n",
    "\n",
    "    # Check if the request was successful\n",
    "    if response.status_code == 200:\n",
    "        # Parse the JSON-LD from the response\n",
    "        json_ld = response.json()\n",
    "        return json_ld\n",
    "    else:\n",
    "        print(f\"Failed to retrieve data: {response.status_code}\")\n",
    "        return None\n",
    "\n",
    "\n",
    "def pretty_print_json(json_obj):\n",
    "    # Pretty print the JSON object\n",
    "    print(json.dumps(json_obj, indent=4))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Let's run a test\n",
    "url = \"https://data-science-with-python-for-seo.wordlift.dev/product/100-pure-deluxe-travel-pack-duo-2/\"\n",
    "json_ld = get_json_ld_from_url(url)\n",
    "json_ld"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Generating Links of Similar Products using WordLift Neural Search\n",
    "\n",
    "With our product embeddings in place, we can now leverage WordLift's Neural Search capabilities to recommend similar products to users. This feature significantly enhances user engagement and can potentially boost sales by showcasing relevant products based on semantic similarity.\n",
    "\n",
    "Unlike traditional keyword matching, semantic similarity considers the context and meaning of product descriptions. This approach allows for more nuanced and accurate recommendations, even when products don't share exact keywords.\n",
    "\n",
    "The `get_top_k_similar_urls` function we've defined earlier implements this functionality. It takes a product URL and returns a list of semantically similar products, ranked by their similarity scores.\n",
    "\n",
    "For example, if a user is viewing a red cotton t-shirt, this feature might recommend other cotton t-shirts in different colors, or similar style tops made from different materials. This creates a more intuitive and engaging shopping experience for the user.\n",
    "\n",
    "By implementing this Neural Search feature, we're able to create a more personalized and efficient shopping experience, potentially leading to increased user satisfaction and higher conversion rates.\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "async def get_top_k_similar_urls(configuration, query_url: str, top_k: int):\n",
    "    request = VectorSearchQueryRequest(\n",
    "        query_url=query_url,\n",
    "        similarity_top_k=top_k,\n",
    "    )\n",
    "\n",
    "    async with wordlift_client.ApiClient(configuration) as api_client:\n",
    "        api_instance = VectorSearchQueriesApi(api_client)\n",
    "        try:\n",
    "            page = await api_instance.create_query(\n",
    "                vector_search_query_request=request\n",
    "            )\n",
    "            return [\n",
    "                {\n",
    "                    \"url\": item.id,\n",
    "                    \"name\": item.text.split(\"\\n\")[0],\n",
    "                    \"score\": item.score,\n",
    "                }\n",
    "                for item in page.items\n",
    "                if item.id and item.text\n",
    "            ]\n",
    "        except Exception as e:\n",
    "            logger.error(f\"Error querying for entities: {e}\", exc_info=True)\n",
    "            return None"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "top_k = 10\n",
    "url = \"https://data-science-with-python-for-seo.wordlift.dev/product/100-mineral-sunscreen-spf-30/\"\n",
    "similar_urls = await get_top_k_similar_urls(\n",
    "    configuration, query_url=url, top_k=top_k\n",
    ")\n",
    "print(json.dumps(similar_urls, indent=2))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Building a Chatbot for the E-commerce Website using LlamaIndex 🦙\n",
    "\n",
    "The Knowledge Graph we've created serves as a perfect foundation for building an intelligent chatbot. LlamaIndex (formerly GPT Index) is a powerful data framework that allows us to ingest, structure, and access private or domain-specific data in Large Language Models (LLMs). With LlamaIndex, we can create a context-aware chatbot that understands our product catalog and can assist customers effectively.\n",
    "\n",
    "By leveraging LlamaIndex in conjunction with our Knowledge Graph, we can develop a chatbot that responds to direct queries. This chatbot will have an understanding of the product catalog, enabling it to:\n",
    "\n",
    "1. Answer questions about product specifications, availability, and pricing\n",
    "2. Make personalized product recommendations based on customer preferences\n",
    "3. Provide comparisons between similar products\n",
    "\n",
    "This approach leads to more natural and helpful interactions with customers, enhancing their shopping experience. The chatbot can draw upon the structured data in our Knowledge Graph, using LlamaIndex to efficiently retrieve and present relevant information through the LLM.\n",
    "\n",
    "In the following sections, we'll walk through the process of setting up LlamaIndex with our Knowledge Graph data and creating a chatbot that can intelligently assist our e-commerce customers."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Installing `LlamaIndex` and `WordLiftVectorStore` 💪"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%capture\n",
    "!pip install llama-index\n",
    "!pip install -U 'git+https://github.com/wordlift/llama_index.git#egg=llama-index-vector-stores-wordlift&subdirectory=llama-index-integrations/vector_stores/llama-index-vector-stores-wordlift'\n",
    "!pip install llama-index-embeddings-nomic"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# import the necessary modules\n",
    "from llama_index.vector_stores.wordlift import WordliftVectorStore\n",
    "from llama_index.core import VectorStoreIndex"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Setting NomicEmbeddings for our Query Engine\n",
    "\n",
    "Nomic has released v1.5 🪆🪆🪆 of their embedding model, which brings significant improvements to text embedding capabilities. Embeddings are numerical representations of text that capture semantic meaning, allowing our system to understand and compare the content of queries and documents.\n",
    "\n",
    "Key features of **Nomic v1.5** include:\n",
    "\n",
    "- Variable-sized embeddings with dimensions between 64 and 768\n",
    "- Matryoshka learning, which allows for nested representations\n",
    "- An expanded context size of 8192 tokens\n",
    "\n",
    "We use NomicEmbeddings in WordLift due to these advanced features, and now we're configuring LlamaIndex to use it as well when encoding user queries. This consistency in embedding models across our stack ensures better alignment between our Knowledge Graph and the query understanding process.\n",
    "\n",
    "More information on NomicEmbeddings can be found [here](https://www.nomic.ai/blog/posts/nomic-embed-matryoshka).\n",
    "\n",
    "Go here to [get your free key](https://atlas.nomic.ai/)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from llama_index.embeddings.nomic import NomicEmbedding\n",
    "\n",
    "nomic_api_key = os.getenv(\"NOMIC_KEY\")\n",
    "\n",
    "embed_model = NomicEmbedding(\n",
    "    api_key=nomic_api_key,\n",
    "    dimensionality=128,\n",
    "    model_name=\"nomic-embed-text-v1.5\",\n",
    ")\n",
    "\n",
    "embedding = embed_model.get_text_embedding(\"Hey Ho SEO!\")\n",
    "len(embedding)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We will use OpenAI as default LLM for generating response. We could of course use any other available LLM.  "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Set the environment variable\n",
    "os.environ[\"OPENAI_API_KEY\"] = OPENAI_KEY"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Let's setup now WordliftVectorStore using data from our Knowledge Graph."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Let's configure WordliftVectorStore using our WL Key\n",
    "vector_store = WordliftVectorStore(key=API_KEY)\n",
    "\n",
    "# Create an index from the vector store\n",
    "index = VectorStoreIndex.from_vector_store(\n",
    "    vector_store, embed_model=embed_model\n",
    ")\n",
    "\n",
    "# Create a query engine\n",
    "query_engine = index.as_query_engine()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "query1 = \"Can you give me a product similar to the facial puff? Please add the URL also\"\n",
    "result1 = query_engine.query(query1)\n",
    "\n",
    "print(result1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Function to handle queries\n",
    "def query_engine(query):\n",
    "    # Create an index from the vector store\n",
    "    index = VectorStoreIndex.from_vector_store(\n",
    "        vector_store, embed_model=embed_model\n",
    "    )\n",
    "\n",
    "    # Create a query engine\n",
    "    query_engine = index.as_query_engine()\n",
    "    response = query_engine.query(query)\n",
    "    return response\n",
    "\n",
    "\n",
    "# Interactive query loop\n",
    "while True:\n",
    "    user_query = input(\"Enter your query (or 'quit' to exit): \")\n",
    "    if user_query.lower() == \"quit\":\n",
    "        break\n",
    "    result = query_engine(user_query)\n",
    "    print(result)\n",
    "    print(\"\\n---\\n\")"
   ]
  }
 ],
 "metadata": {
  "colab": {
   "private_outputs": true,
   "provenance": []
  },
  "kernelspec": {
   "display_name": "Python 3",
   "name": "python3"
  },
  "language_info": {
   "name": "python"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
